{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 垃圾邮件分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "SPAM_PATH=os.path.join(\"datasets\",\"spam\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 我们可以使用python的“email”模块解析这些电子邮件（它处理邮件头、编码等）\n",
    "import email\n",
    "import email.policy\n",
    "def load_email(is_spam,filename,spam_path=SPAM_PATH):\n",
    "    directory=\"spam\" if is_spam else \"easy_ham\"\n",
    "    with open(os.path.join(spam_path,directory,filename),\"rb\") as f:\n",
    "        return email.parser.BytesParser(policy=email.policy.default).parse(f)\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Martin A posted:\n",
      "Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n",
      " limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n",
      " Mount Athos monastic community, was ideal for the patriotic sculpture. \n",
      " \n",
      " As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n",
      " museum, a restored amphitheatre and car park for admiring crowds are\n",
      "planned\n",
      "---------------------\n",
      "So is this mountain limestone or granite?\n",
      "If it's limestone, it'll weather pretty fast.\n",
      "\n",
      "------------------------ Yahoo! Groups Sponsor ---------------------~-->\n",
      "4 DVDs Free +s&p Join Now\n",
      "http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n",
      "---------------------------------------------------------------------~->\n",
      "\n",
      "To unsubscribe from this group, send an email to:\n",
      "forteana-unsubscribe@egroups.com\n",
      "\n",
      " \n",
      "\n",
      "Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/\n"
     ]
    }
   ],
   "source": [
    "#获取正常邮件和垃圾邮件\n",
    "\n",
    "HAM_DIR = os.path.join(SPAM_PATH, \"easy_ham\")\n",
    "SPAM_DIR = os.path.join(SPAM_PATH, \"spam\")\n",
    "\n",
    "ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]\n",
    "spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]\n",
    "\n",
    "# 让我们看一个ham示例和一个spam示例，了解数据的外观\n",
    "ham_emails =[load_email(is_spam=False,filename=name) for name in ham_filenames]\n",
    "spam_emails = [load_email(is_spam=True,filename=name) for name in spam_filenames]\n",
    "\n",
    "print(ham_emails[1].get_content().strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1) Fight The Risk of Cancer!\n",
      "http://www.adclick.ws/p.cfm?o=315&s=pk007\n",
      "\n",
      "2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days\n",
      "http://www.adclick.ws/p.cfm?o=249&s=pk007\n",
      "\n",
      "3) Get the Child Support You Deserve - Free Legal Advice\n",
      "http://www.adclick.ws/p.cfm?o=245&s=pk002\n",
      "\n",
      "4) Join the Web's Fastest Growing Singles Community\n",
      "http://www.adclick.ws/p.cfm?o=259&s=pk007\n",
      "\n",
      "5) Start Your Private Photo Album Online!\n",
      "http://www.adclick.ws/p.cfm?o=283&s=pk007\n",
      "\n",
      "Have a Wonderful Day,\n",
      "Offer Manager\n",
      "PrizeMama\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "If you wish to leave this list please use the link below.\n",
      "http://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258\n",
      "\n",
      "\n",
      "-- \n",
      "Irish Linux Users' Group: ilug@linux.ie\n",
      "http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.\n",
      "List maintainer: listmaster@linux.ie\n"
     ]
    }
   ],
   "source": [
    "print(spam_emails[1].get_content().strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 电子邮件实际上有很多部分，带有图像和附件（它们可以有自己的附件）。查看邮件的各种类型的结构：\n",
    "def get_email_structure(email):\n",
    "    if isinstance(email,str):\n",
    "        return email\n",
    "    payload=email.get_payload()\n",
    "    if isinstance(payload,list):\n",
    "        return \"multipart({})\".format(\",\".join([get_email_structure(sub_email) for sub_email in payload]))\n",
    "    else:\n",
    "        return email.get_content_type()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "def structures_counter(emails):\n",
    "    structures =Counter()\n",
    "    for email in emails:\n",
    "        structure = get_email_structure(email)\n",
    "        structures[structure] +=1\n",
    "    return structures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('text/plain', 2408),\n",
       " ('multipart(text/plain,application/pgp-signature)', 66),\n",
       " ('multipart(text/plain,text/html)', 8),\n",
       " ('multipart(text/plain,text/plain)', 4),\n",
       " ('multipart(text/plain)', 3),\n",
       " ('multipart(text/plain,application/octet-stream)', 2),\n",
       " ('multipart(text/plain,text/enriched)', 1),\n",
       " ('multipart(text/plain,application/ms-tnef,text/plain)', 1),\n",
       " ('multipart(multipart(text/plain,text/plain,text/plain),application/pgp-signature)',\n",
       "  1),\n",
       " ('multipart(text/plain,video/mng)', 1),\n",
       " ('multipart(text/plain,multipart(text/plain))', 1),\n",
       " ('multipart(text/plain,application/x-pkcs7-signature)', 1),\n",
       " ('multipart(text/plain,multipart(text/plain,text/plain),text/rfc822-headers)',\n",
       "  1),\n",
       " ('multipart(text/plain,multipart(text/plain,text/plain),multipart(multipart(text/plain,application/x-pkcs7-signature)))',\n",
       "  1),\n",
       " ('multipart(text/plain,application/x-java-applet)', 1)]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "structures_counter(ham_emails).most_common()  #正常邮件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('text/plain', 218),\n",
       " ('text/html', 183),\n",
       " ('multipart(text/plain,text/html)', 45),\n",
       " ('multipart(text/html)', 20),\n",
       " ('multipart(text/plain)', 19),\n",
       " ('multipart(multipart(text/html))', 5),\n",
       " ('multipart(text/plain,image/jpeg)', 3),\n",
       " ('multipart(text/html,application/octet-stream)', 2),\n",
       " ('multipart(text/plain,application/octet-stream)', 1),\n",
       " ('multipart(text/html,text/plain)', 1),\n",
       " ('multipart(multipart(text/html),application/octet-stream,image/jpeg)', 1),\n",
       " ('multipart(multipart(text/plain,text/html),image/gif)', 1),\n",
       " ('multipart/alternative', 1)]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "structures_counter(spam_emails).most_common()  #垃圾邮件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Return-Path : <12a1mailbot1@web.de>\n",
      "Delivered-To : zzzz@localhost.spamassassin.taint.org\n",
      "Received : from localhost (localhost [127.0.0.1])\tby phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32\tfor <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)\n",
      "Received : from mail.webnote.net [193.120.211.219]\tby localhost with POP3 (fetchmail-5.9.0)\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)\n",
      "Received : from dd_it7 ([210.97.77.167])\tby webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623\tfor <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100\n",
      "From : 12a1mailbot1@web.de\n",
      "Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);\t Sat, 24 Aug 2002 09:42:10 +0900\n",
      "To : dcek1a1@netsgo.com\n",
      "Subject : Life Insurance - Why Pay More?\n",
      "Date : Wed, 21 Aug 2002 20:31:57 -1600\n",
      "MIME-Version : 1.0\n",
      "Message-ID : <0103c1042001882DD_IT7@dd_it7>\n",
      "Content-Type : text/html; charset=\"iso-8859-1\"\n",
      "Content-Transfer-Encoding : quoted-printable\n"
     ]
    }
   ],
   "source": [
    "# 正常邮件更多的是纯文本，而垃圾邮件有相当多的HTML。此外，相当多的ham电子邮件使用pgp签名，而没有垃圾邮件。\n",
    "# 查看邮件头\n",
    "for header,value in spam_emails[0].items():\n",
    "    print(header,\":\",value)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Life Insurance - Why Pay More?'"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 里面可能有很多有用的信息，比如发件人的电子邮件地址（12a1mailbot1@web.de看起来很可疑），\n",
    "# 查看“主题”标题：\n",
    "spam_emails[0][\"Subject\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "## 拆分训练集合测试集合\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "X=np.array(ham_emails +spam_emails)\n",
    "y=np.array([0]*len(ham_emails) +[1]*len(spam_emails))\n",
    "\n",
    "X_train,X_test,y_train,y_test =train_test_split(X,y,test_size=0.2,random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array([<email.message.EmailMessage object at 0x000001C66DF72B88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DFDD108>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBE7148>, ...,\n",
       "        <email.message.EmailMessage object at 0x000001C66E39C508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E537208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E107A08>],\n",
       "       dtype=object),\n",
       " array([<email.message.EmailMessage object at 0x000001C66E8948C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E442848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8A3788>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC26088>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B54CF08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E3765C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E63F348>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9B8848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD8D448>,\n",
       "        <email.message.EmailMessage object at 0x000001C66BBE0408>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA3BE48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAADF88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF76348>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E86D948>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D272EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E150808>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2F6488>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE0EBC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D166188>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B2BA848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B5FA848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC83FC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E16B6C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E99CEC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5BD048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D6F2208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CD387C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E491C88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC69348>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E4B0448>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E26AB48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E839948>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E00A3C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D3FE488>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC90CC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E053108>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9D59C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DCC9F08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC8E948>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E538048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBBF248>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DABBBC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD6BF48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DFE8E48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7B6308>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E74D548>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EACEBC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B613888>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B646148>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7D7BC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E335048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9B5B48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBD0648>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAC9F88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB6C908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D7119C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5BD108>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E04B508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CF10908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E262A48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B5C7788>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF391C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E0E3208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC55D48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAB5108>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E444F08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E4F9A48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E6E3F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7B4708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE7BF88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8446C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CE05F88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D1CBB08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E90EA48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB07E08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DFD8BC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA7DDC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D0A0888>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA48A88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EB3D408>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8BFB48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8B2508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E077B08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8AA208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E956888>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB5BE08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B69C488>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E583508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EB32EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D6A8C08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DCCA088>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E4E28C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAC3E48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E4F48C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E866B48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDA2208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CE08048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA57688>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB919C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E858648>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9EBE48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAA4E08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DFBA6C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7EFF88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E47DD88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E10DF88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9322C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DCE1BC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB012C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E82F148>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CDA1C88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE594C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D6F51C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E97BF48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE40488>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9CAFC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B5E5888>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DCB7548>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E4F6748>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E91EB88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D374C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E564AC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E0221C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E875D88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E454808>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9EB708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E947988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E129CC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E03A708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E1960C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAA2608>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5C9B88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8A05C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E893908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E19DB48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDC7A88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBC1708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E86A348>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2F9F08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA82888>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B69E9C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E4CF148>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA73BC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D2C78C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA1DA08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDE6988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9E39C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA9F888>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E962188>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC5C088>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8BFD88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D75D488>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E220708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9E9808>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD06988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8E3AC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBB06C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D1CB908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E802E88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D232AC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E84B748>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8D59C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC2A308>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DEC7748>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E866648>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5C6508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAD0D88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CDC2688>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA51A08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE93E88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D6C2348>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CFA9288>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E6FDA08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E0330C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D1C36C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E46DC08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA31EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E88AC48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE9C788>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E863748>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE03EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CF03308>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8BD4C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D648E08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB3DC48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB8DE88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBBBCC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA66D88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E657608>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E841C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E004EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2B6D88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9321C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE16388>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2A37C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CD37148>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD2BE88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B6A6FC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E0E56C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC3BF48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E48C048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E94CC88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBE2B48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CF13B48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E924FC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5C6D88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9D75C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9D9808>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DEDEDC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA0BD08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9A98C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E83E188>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E4EAC08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9C0508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E077D88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAEA688>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B603A88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD97B48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8967C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA12B88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E564848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E791B08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E05F948>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CDA35C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E63E848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDEF208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB55F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA22F88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E29D2C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAF4248>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E673B08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8B2408>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B91B988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC08748>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF2AA88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2D04C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DEE0408>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E85B308>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B6A7688>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E102F88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EB1B548>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E0AF4C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E970C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8E38C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB58AC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDD4048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E415B88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE358C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E12BBC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B646048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E30FC88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5373C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB98388>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E3355C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CF9F748>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D402D48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9329C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CF10B08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9A7B08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9D9E88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E772F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E83B0C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB9D888>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E313248>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF1C048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9F9F08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B69C288>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CE45C88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D2BFF08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE27588>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E936808>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2EC188>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA44AC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE96908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E358C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E35F048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D747A88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E37AA48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E1BB908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB78E88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E760C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E1FDE08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE0EA08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E0E6D08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E234A88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8FFD08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B5F98C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D6FC248>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDADA88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE65588>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D238208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CE23EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E617D08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7B9D08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E913C08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EB4FC48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E064908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CE55948>,\n",
       "        <email.message.EmailMessage object at 0x000001C66A03D508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD7F148>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDC4D48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E3D7208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAC9C08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E73CFC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CE72788>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF49C08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E64CB08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E76E988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E77D908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DCD4508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DCE1B88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5FA548>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF7C608>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA26A88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E6FD848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9E3908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DEAD648>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D368608>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8A75C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E815388>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E422C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B37BF08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9A7B88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E970B88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E834C08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E82CF88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC14348>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9C01C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B9473C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA57DC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E03A488>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E392FC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8E2248>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E36FC88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E356608>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D39FD08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA7D8C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5C6B08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA6DBC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E63A048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B6B9308>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DCE10C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E34DA88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CC53688>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DEC37C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E6BCF88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CFEA208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF10808>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E76E608>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD94608>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2E4F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EB25FC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB20688>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E96B4C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CFC6088>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DCB4F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D01A688>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E0DF1C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D768C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5B20C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D378E88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA94D88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8ECC48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D8A4848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DFE8A08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E069DC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8A1C88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5A80C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8D84C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E98F948>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDBD448>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DED7F88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8621C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E87AEC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAD6B08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E885E08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E424848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E70CC88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DABA708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7DD288>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF72248>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9EB548>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E147D08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E064D48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DCADB48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAB9908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E85FF88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E16CA08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF19A08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E41A248>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA5D048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E85F548>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E3F9CC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E6C3CC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9A9148>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E79E708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DEE5FC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DEBCF48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF94388>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CDA9F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E11E948>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CDA4F08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E794A08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CE451C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CDDE188>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E769788>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB34448>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CEF2488>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E095448>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9F78C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E802C88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DCB9108>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E782408>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC0EF48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE5E7C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E58BF48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5F6C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB26848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7AF648>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD94748>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBDD4C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9B88C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E54D508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E324D88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E020088>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E16C708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E090108>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDCD788>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DEA5208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE2E048>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EB17B08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD53388>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D730548>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E441188>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E47DA88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E464788>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E4F4408>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E91FC88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA65D48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E19DE48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB51708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E880508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9E98C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E6F2A88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA378C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD28A48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9698C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D6F2088>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B5FF1C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B484348>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D377988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2A4F88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D6BC448>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E6491C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E6E5588>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBA5308>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E392E08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBB1C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD9FF48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB8B8C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5EEFC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDFD408>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAA22C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAF4088>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E56DD88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E85FFC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B5AD148>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E162F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC70FC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EB0B8C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DFCF648>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9A5C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E992788>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC38F88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CD3B848>,\n",
       "        <email.message.EmailMessage object at 0x000001C66BBE9108>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAEC888>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D6CC088>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E476748>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2BA988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EB028C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB61AC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E85E288>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA98D88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAB2C08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAE1588>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9B8B08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAC9808>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB41D48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D752688>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E0C8308>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E894988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D74DD48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D39EEC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8F8F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E109A48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE84A88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA19948>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DDB9948>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D19CF88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E862708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF36E88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E1D0F08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B5985C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBEFB08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E908DC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAA0988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2B6EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CE40C88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E1A2C88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B4EFC48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB95EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DED7E88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9317C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E09A648>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CF0E6C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E1E8F08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE88EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E63FD08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7FA688>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC5C148>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E1568C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E85FA08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E0EC308>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E1CC288>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7FE4C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CDB1588>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA6DB88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8F7F08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CCF5188>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E970348>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E350BC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E52EA08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D68DBC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB84F08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAA2C88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E6B6F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D004EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E617B08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E496648>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA23F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E85BD48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD2D248>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA498C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E107888>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E549CC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD06EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAB2648>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B54D0C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E2234C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8C6748>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA9FAC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DE49C88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E81BE88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBDAA08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EB1E7C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF26808>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBFDFC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD21D88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DEFA208>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D346908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9F1EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E967288>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E106288>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD88508>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E90A988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7EFF08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D8A46C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E4EAE08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EAC2E48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DFCEB88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B623E48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D6B9C08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E45CAC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66EA22F48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DD71A48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D8DFFC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E0984C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E335488>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DBC8F08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC79E88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DC0A708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D012E88>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D648A08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DAF7E08>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7C3EC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66B76D288>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E7CD988>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D87AA48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E8D5908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D28C708>,\n",
       "        <email.message.EmailMessage object at 0x000001C66D630548>,\n",
       "        <email.message.EmailMessage object at 0x000001C66CC515C8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E617C48>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E5E2DC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DB07DC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E9A1CC8>,\n",
       "        <email.message.EmailMessage object at 0x000001C66DF25188>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E868908>,\n",
       "        <email.message.EmailMessage object at 0x000001C66E577188>],\n",
       "       dtype=object),\n",
       " array([0, 0, 0, ..., 0, 0, 0]),\n",
       " array([0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,\n",
       "        0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n",
       "        0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,\n",
       "        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,\n",
       "        0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,\n",
       "        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n",
       "        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,\n",
       "        0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,\n",
       "        0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,\n",
       "        0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n",
       "        0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,\n",
       "        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0,\n",
       "        0, 0, 0, 0, 0, 0]))"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the four split arrays together as one tuple.\n",
    "# NOTE(review): the saved output of this cell is very long; displaying each\n",
    "# array's `.shape` instead would give a compact summary.\n",
    "X_train,X_test,y_train,y_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 首先需要一个函数将 HTML 转换为纯文本。这里没有使用 BeautifulSoup 库，而是用正则表达式实现：下面的函数首先删除 `<head>` 部分，然后将所有 `<a>` 标记替换为单词 HYPERLINK，接着去掉其余所有 HTML 标记，只留下纯文本。为了可读性，它还把连续的多个换行符替换为一个换行符，最后对 HTML 实体（例如 `&gt;` 或 `&nbsp;`）进行反转义。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re \n",
    "from html import unescape\n",
    "\n",
    "def html_to_plain_text(html):\n",
    "    \"\"\"Convert an HTML document to plain text with a few regex passes.\n",
    "\n",
    "    Steps, in order: drop the <head> section, replace every opening\n",
    "    <a ...> tag with the token HYPERLINK (surrounded by spaces), strip all\n",
    "    remaining tags, collapse runs of blank lines into a single newline, and\n",
    "    finally unescape HTML entities (e.g. &gt; or &nbsp;).\n",
    "\n",
    "    Args:\n",
    "        html: HTML source as a str.\n",
    "\n",
    "    Returns:\n",
    "        The plain-text rendering as a str.\n",
    "    \"\"\"\n",
    "    # Raw strings throughout: '\\s' inside a normal literal is an invalid\n",
    "    # escape sequence and triggers a warning on recent Python versions.\n",
    "    # re.S lets '.' span newlines so multi-line tags are matched; re.I makes\n",
    "    # the tag names case-insensitive (<HEAD>, <A ...>).\n",
    "    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.S | re.I)\n",
    "    text = re.sub(r'<a\\s.*?>', ' HYPERLINK ', text, flags=re.S | re.I)\n",
    "    text = re.sub(r'<.*?>', '', text, flags=re.S)\n",
    "    text = re.sub(r'(\\s*\\n)+', '\\n', text)\n",
    "    return unescape(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<HTML><HEAD><TITLE></TITLE><META http-equiv=\"Content-Type\" content=\"text/html; charset=windows-1252\"><STYLE>A:link {TEX-DECORATION: none}A:active {TEXT-DECORATION: none}A:visited {TEXT-DECORATION: none}A:hover {COLOR: #0033ff; TEXT-DECORATION: underline}</STYLE><META content=\"MSHTML 6.00.2713.1100\" name=\"GENERATOR\"></HEAD>\n",
      "<BODY text=\"#000000\" vLink=\"#0033ff\" link=\"#0033ff\" bgColor=\"#CCCC99\"><TABLE borderColor=\"#660000\" cellSpacing=\"0\" cellPadding=\"0\" border=\"0\" width=\"100%\"><TR><TD bgColor=\"#CCCC99\" valign=\"top\" colspan=\"2\" height=\"27\">\n",
      "<font size=\"6\" face=\"Arial, Helvetica, sans-serif\" color=\"#660000\">\n",
      "<b>OTC</b></font></TD></TR><TR><TD height=\"2\" bgcolor=\"#6a694f\">\n",
      "<font size=\"5\" face=\"Times New Roman, Times, serif\" color=\"#FFFFFF\">\n",
      "<b>&nbsp;Newsletter</b></font></TD><TD height=\"2\" bgcolor=\"#6a694f\"><div align=\"right\"><font color=\"#FFFFFF\">\n",
      "<b>Discover Tomorrow's Winners&nbsp;</b></font></div></TD></TR><TR><TD height=\"25\" colspan=\"2\" bgcolor=\"#CCCC99\"><table width=\"100%\" border=\"0\"  ...\n"
     ]
    }
   ],
   "source": [
    "# Keep only the training spam messages whose structure is plain \"text/html\".\n",
    "html_spam_emails=[\n",
    "    message\n",
    "    for message in X_train[y_train==1]\n",
    "    if get_email_structure(message)==\"text/html\"\n",
    "]\n",
    "\n",
    "# Pick one of them and preview the first 1000 characters of its raw HTML body.\n",
    "sample_html_spam=html_spam_emails[7]\n",
    "print(sample_html_spam.get_content().strip()[:1000],\"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "OTC\n",
      " Newsletter\n",
      "Discover Tomorrow's Winners \n",
      "For Immediate Release\n",
      "Cal-Bay (Stock Symbol: CBYI)\n",
      "Watch for analyst \"Strong Buy Recommendations\" and several advisory newsletters picking CBYI.  CBYI has filed to be traded on the OTCBB, share prices historically INCREASE when companies get listed on this larger trading exchange. CBYI is trading around 25 cents and should skyrocket to $2.66 - $3.25 a share in the near future.\n",
      "Put CBYI on your watch list, acquire a position TODAY.\n",
      "REASONS TO INVEST IN CBYI\n",
      "A profitable company and is on track to beat ALL earnings estimates!\n",
      "One of the FASTEST growing distributors in environmental & safety equipment instruments.\n",
      "Excellent management team, several EXCLUSIVE contracts.  IMPRESSIVE client list including the U.S. Air Force, Anheuser-Busch, Chevron Refining and Mitsubishi Heavy Industries, GE-Energy & Environmental Research.\n",
      "RAPIDLY GROWING INDUSTRY\n",
      "Industry revenues exceed $900 million, estimates indicate that there could be as much as $25 billi ...\n"
     ]
    }
   ],
   "source": [
    "# Same sample after HTML-to-text conversion, truncated to 1000 characters.\n",
    "print(html_to_plain_text(sample_html_spam.get_content())[:1000], \"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 编写一个函数，它以电子邮件为输入，并以纯文本形式返回其内容，无论其格式是什么\n",
    "def email_to_text(email):\n",
    "    html=None\n",
    "    for part in email.walk():\n",
    "        ctype=part.get_content_type()\n",
    "        if not ctype in (\"text/plain\",\"text/html\"):\n",
    "            continue\n",
    "        try:\n",
    "            content=part.get_content()\n",
    "        except: #解决编码问题\n",
    "            content=str(part.get_payload())\n",
    "        if ctype ==\"text/plain\":\n",
    "            return content\n",
    "        else:\n",
    "            html=content\n",
    "    if html:\n",
    "        return html_to_plain_text(html)\n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "OTC\n",
      " Newsletter\n",
      "Discover Tomorrow's Winners \n",
      "For Immediate Release\n",
      "Cal-Bay (Stock Symbol: CBYI)\n",
      "Wat ...\n"
     ]
    }
   ],
   "source": [
    "# email_to_text on the HTML-only sample: shows the first 100 characters of the result.\n",
    "print(email_to_text(sample_html_spam)[:100],\"...\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 如果中文就用结巴分词（jieba）\n",
    "# 装nltk,urlextract\n",
    "import nltk\n",
    "from urlextract import URLExtract\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 将所有处理整合到一个转换器中，我们将使用它将电子邮件转换为单词计数器。注意，我们使用python的'split（）'方法将句子拆分为单词，该方法使用空格作为单词边界。但并非所有语言都适用，例如汉语和日语的文字通常不在单词之间使用空格。在这个练习中没关系，因为数据集（主要）是英文的；中文可以使用结巴分词（jieba）来进行拆分。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "#定义转换器\n",
    "from sklearn.base import BaseEstimator,TransformerMixin\n",
    "\n",
    "class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Convert parsed emails into one word-count ``Counter`` per email.\n",
    "\n",
    "    Each email is turned into plain text with ``email_to_text`` (an email that\n",
    "    yields no text gives an empty ``Counter``), optionally normalised according\n",
    "    to the flags below, split on whitespace and counted.\n",
    "\n",
    "    Parameters:\n",
    "        strip_headers: stored for interface compatibility; ``transform`` does not\n",
    "            consult it.\n",
    "        lower_case: lower-case the text before any other processing.\n",
    "        remove_punctuation: replace each run of non-word characters by one space.\n",
    "        replace_urls: replace every URL found by ``URLExtract`` with ``URL``.\n",
    "        replace_numbers: replace every number with ``NUMBER``.\n",
    "        stemming: stem each word with nltk's Porter stemmer and merge the counts\n",
    "            of words that share a stem.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,\n",
    "                 replace_urls=True, replace_numbers=True, stemming=True):\n",
    "        self.strip_headers = strip_headers\n",
    "        self.lower_case = lower_case\n",
    "        self.remove_punctuation = remove_punctuation\n",
    "        self.replace_urls = replace_urls\n",
    "        self.replace_numbers = replace_numbers\n",
    "        self.stemming = stemming\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Nothing is learned from the data; return ``self`` unchanged.\"\"\"\n",
    "        return self\n",
    "\n",
    "    def transform(self, X, y=None):\n",
    "        \"\"\"Return a NumPy array holding one ``Counter`` of words per email in ``X``.\"\"\"\n",
    "        # The URL extractor and the stemmer do not depend on the email, so build\n",
    "        # them once per call instead of once per message.\n",
    "        url_extractor = URLExtract() if self.replace_urls else None\n",
    "        stemmer = nltk.PorterStemmer() if self.stemming else None\n",
    "\n",
    "        X_transformed = []\n",
    "        for email in X:\n",
    "            text = email_to_text(email) or \"\"\n",
    "            if self.lower_case:\n",
    "                text = text.lower()\n",
    "            if url_extractor is not None:\n",
    "                # Longest URLs first, so a URL that is a prefix of a longer one\n",
    "                # does not clobber part of the longer match.\n",
    "                urls = sorted(set(url_extractor.find_urls(text)), key=len, reverse=True)\n",
    "                for url in urls:\n",
    "                    text = text.replace(url, \"URL\")\n",
    "            if self.replace_numbers:\n",
    "                # Integer, optional decimal part, optional exponent: \"2.66\" and\n",
    "                # \"1e5\" each become a single NUMBER token.\n",
    "                text = re.sub(r'\\d+(?:\\.\\d+)?(?:[eE][+-]?\\d+)?', 'NUMBER', text)\n",
    "            if self.remove_punctuation:\n",
    "                text = re.sub(r'\\W+', ' ', text, flags=re.M)\n",
    "            word_counts = Counter(text.split())\n",
    "            if stemmer is not None:\n",
    "                # Merge the counts of all words reduced to the same stem.\n",
    "                # NOTE(review): the Porter stemmer lower-cases its input by default,\n",
    "                # so the URL / NUMBER placeholders are presumably counted as\n",
    "                # \"url\" / \"number\" when stemming is on -- confirm.\n",
    "                stemmed_counts = Counter()\n",
    "                for word, count in word_counts.items():\n",
    "                    stemmed_counts[stemmer.stem(word)] += count\n",
    "                word_counts = stemmed_counts\n",
    "            X_transformed.append(word_counts)\n",
    "        return np.array(X_transformed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),\n",
       "       Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christianity': 3, 'to': 3, 'by': 3, 'jefferson': 2, 'i': 2, 'have': 2, 'one': 2, 'on': 2, 'been': 2, 'has': 2, 'half': 2, 'jesus': 2, 'some': 1, 'interesting': 1, 'quotes': 1, 'URL': 1, 'thomas': 1, 'examined': 1, 'known': 1, 'superstitions': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'superstition': 1, 'redeeming': 1, 'feature': 1, 'they': 1, 'are': 1, 'alike': 1, 'founded': 1, 'fables': 1, 'mythology': 1, 'millions': 1, 'innocent': 1, 'men': 1, 'women': 1, 'children': 1, 'since': 1, 'introduction': 1, 'burnt': 1, 'tortured': 1, 'fined': 1, 'imprisoned': 1, 'what': 1, 'effect': 1, 'this': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fools': 1, 'other': 1, 'hypocrites': 1, 'support': 1, 'roguery': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'historic': 1, 'americans': 1, 'john': 1, 'e': 1, 'remsburg': 1, 'letter': 1, 'william': 1, 'short': 1, 'again': 1, 'become': 1, 'most': 1, 'perverted': 1, 'system': 1, 'that': 1, 'ever': 1, 'shone': 1, 'man': 1, 'rogueries': 1, 'absurdities': 1, 'untruths': 1, 'were': 1, 'perpetrated': 1, 'upon': 1, 'teachings': 1, 'a': 1, 'large': 1, 'band': 1, 'dupes': 1, 'importers': 1, 'led': 1, 'paul': 1, 'first': 1, 'great': 1, 'corrupter': 1, 'teaching': 1}),\n",
       "       Counter({'URL': 4, 's': 3, 'to': 3, 'in': 2, 'forteana': 2, 'martin': 2, 'an': 2, 'and': 2, 'we': 2, 'is': 2, 'yahoo': 2, 'groups': 2, 'unsubscribe': 2, 'y': 1, 'adamson': 1, 'wrote': 1, 'for': 1, 'alternative': 1, 'rather': 1, 'more': 1, 'factually': 1, 'based': 1, 'rundown': 1, 'on': 1, 'hamza': 1, 'career': 1, 'including': 1, 'his': 1, 'belief': 1, 'that': 1, 'all': 1, 'non': 1, 'muslims': 1, 'yemen': 1, 'should': 1, 'be': 1, 'murdered': 1, 'outright': 1, 'know': 1, 'how': 1, 'unbiased': 1, 'memri': 1, 'don': 1, 't': 1, 'html': 1, 'rob': 1, 'sponsor': 1, 'NUMBER': 1, 'dvds': 1, 'free': 1, 'p': 1, 'join': 1, 'now': 1, 'from': 1, 'this': 1, 'group': 1, 'send': 1, 'email': 1, 'egroups': 1, 'com': 1, 'your': 1, 'use': 1, 'of': 1, 'subject': 1})],\n",
       "      dtype=object)"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Try the transformer on the first three training emails.\n",
    "X_few = X_train[:3]\n",
    "X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)\n",
    "X_few_wordcounts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 有了单词计数，我们需要把它们转换成向量。为此，我们将构建另一个转换器，其“fit（）”方法将构建词汇表（最常用单词的有序列表），其“transform（）”方法将使用词汇表将单词计数转换为向量。输出是稀疏矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 将词汇量表转换成稀疏矩阵\n",
    "from scipy.sparse import csr_matrix\n",
    "class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):\n",
    "    \"\"\"Turn word-count mappings into the rows of a sparse count matrix.\n",
    "\n",
    "    ``fit`` ranks the words by total count and keeps the first\n",
    "    ``vocabulary_size`` of them, numbered from 1. ``transform`` places each\n",
    "    word's count in its vocabulary column; words that are not in the\n",
    "    vocabulary are all sent to column 0.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, vocabulary_size=1000):\n",
    "        self.vocabulary_size = vocabulary_size  # number of words kept in the vocabulary\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Build ``vocabulary_`` (word -> column index) from the mappings in ``X``.\"\"\"\n",
    "        totals = Counter()\n",
    "        for counts in X:\n",
    "            for word, occurrences in counts.items():\n",
    "                # A single email contributes at most 10 to a word's total.\n",
    "                totals[word] += min(occurrences, 10)\n",
    "        ranked = totals.most_common()[:self.vocabulary_size]\n",
    "        self.most_common = ranked\n",
    "        self.vocabulary_ = {\n",
    "            word: position + 1 for position, (word, _) in enumerate(ranked)\n",
    "        }\n",
    "        return self\n",
    "\n",
    "    def transform(self, X, y=None):\n",
    "        \"\"\"Return a CSR matrix of shape ``(len(X), vocabulary_size + 1)``.\"\"\"\n",
    "        rows, cols, data = [], [], []\n",
    "        for row_index, counts in enumerate(X):\n",
    "            for word, occurrences in counts.items():\n",
    "                rows.append(row_index)\n",
    "                # Unknown words fall back to column 0.\n",
    "                cols.append(self.vocabulary_.get(word, 0))\n",
    "                data.append(occurrences)\n",
    "        matrix_shape = (len(X), self.vocabulary_size + 1)\n",
    "        return csr_matrix((data, (rows, cols)), shape=matrix_shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<3x11 sparse matrix of type '<class 'numpy.int32'>'\n",
       "\twith 20 stored elements in Compressed Sparse Row format>"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Vectorise the three sample word counts with a deliberately tiny 10-word vocabulary.\n",
    "vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)\n",
    "X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)\n",
    "X_few_vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],\n",
       "       [99, 11,  9,  8,  3,  1,  3,  1,  3,  2,  3],\n",
       "       [67,  0,  1,  2,  3,  4,  1,  2,  0,  1,  0]], dtype=int32)"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Dense view of the sparse matrix: one row per email, one column per vocabulary index.\n",
    "X_few_vectors.toarray()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 第三行第一列中的67表示第三封电子邮件包含67个不属于词汇表的单词。旁边的0表示词汇表中的第一个单词在此电子邮件中没有出现，再旁边的1表示第二个单词出现一次，2表示第三个单词出现两次，依此类推。看一下词汇表。第一个词是“the”，第二个词是“of”等"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'the': 1,\n",
       " 'of': 2,\n",
       " 'and': 3,\n",
       " 'to': 4,\n",
       " 'URL': 5,\n",
       " 'all': 6,\n",
       " 'in': 7,\n",
       " 'christianity': 8,\n",
       " 'on': 9,\n",
       " 'by': 10}"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Learned mapping from word to column index (column 0 is used for out-of-vocabulary words).\n",
    "vocab_transformer.vocabulary_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 我们现在准备训练我们的第一个垃圾邮件分类器！让我们转换整个数据集："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocessing pipeline: emails -> word counts -> sparse count vectors\n",
    "from sklearn.pipeline import Pipeline\n",
    "\n",
    "preprocess_pipeline = Pipeline(\n",
    "    steps=[\n",
    "        (\"email_to_wordcount\", EmailToWordCounterTransformer()),\n",
    "        (\"wordcount_to_vector\", WordCounterToVectorTransformer()),\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Fit both steps on the training emails and vectorise them in one go.\n",
    "X_train_transformed = preprocess_pipeline.fit_transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
      "[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s\n",
      "[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CV]  ................................................................\n",
      "[CV] .................................... , score=0.983, total=   0.1s\n",
      "[CV]  ................................................................\n",
      "[CV] .................................... , score=0.985, total=   0.0s\n",
      "[CV]  ................................................................\n",
      "[CV] .................................... , score=0.993, total=   0.2s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.3s finished\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "0.9866666666666667"
      ]
     },
     "execution_count": 72,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Logistic-regression baseline scored with 3-fold cross-validation\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.model_selection import cross_val_score\n",
    "\n",
    "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n",
    "# One score per fold; the cell displays their mean.\n",
    "score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)\n",
    "score.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 得到的分数约为98.7%，可以尝试多个模型，选择最好的模型，并使用交叉验证对它们进行微调。在测试集上得到的精度/召回率："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "精度:96.88%\n",
      "召回:97.89%\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "c:\\users\\administrator\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\svm\\base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
      "  \"the number of iterations.\", ConvergenceWarning)\n"
     ]
    }
   ],
   "source": [
    "# Precision and recall on the held-out test set\n",
    "from sklearn.metrics import precision_score, recall_score\n",
    "\n",
    "# Reuse the pipeline fitted on the training set; transform only, no refit.\n",
    "X_test_transformed = preprocess_pipeline.transform(X_test)\n",
    "\n",
    "log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n",
    "log_clf.fit(X_train_transformed, y_train)\n",
    "\n",
    "y_pred = log_clf.predict(X_test_transformed)  # predicted class labels\n",
    "\n",
    "# Keep the results under their own names: rebinding precision_score / recall_score\n",
    "# would replace the imported functions with floats for the rest of the session.\n",
    "test_precision = precision_score(y_test, y_pred)\n",
    "test_recall = recall_score(y_test, y_pred)\n",
    "\n",
    "print(\"精度:{:.2f}%\".format(100 * test_precision))\n",
    "print(\"召回:{:.2f}%\".format(100 * test_recall))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 总结\n",
    "1. 加载数据并纵观数据大局\n",
    "2. 获取邮件的组成结构\n",
    "3. 对结构类型进行分析 发现垃圾邮件大多有HTML结构\n",
    "4. 数据清洗，定义将email对象中的HTML转换成纯文本的方法\n",
    "5. 对数据集拆分成训练集和测试集\n",
    "6. 数据处理转换，对邮件的文本内容进行分词处理，通过nltk进行词干提取，对邮件出现的词汇进行计数统计，对所有邮件统计出了一个词汇表\n",
    "7. 通过词汇表和邮件单词计数统计，将单词计数转化成向量矩阵\n",
    "8. 把数据清洗和数据处理封装成两个转换器\n",
    "9. 通过流水线来自动化处理数据\n",
    "10. 使用逻辑回归线性分类器进行模型训练\n",
    "11. 使用交叉验证进行微调\n",
    "12. 在测试集上得到精度/召回率"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
